# Load libraries
library(plyr)
library(tidyverse)
library(edgeR)
library(AnnotationHub)
library(magrittr)
library(scales)
library(pander)
library(ggrepel)
library(fgsea)
library(pheatmap)
library(igraph)
library(tidygraph)
library(ggraph)
library(grid)
library(RUVSeq)
library(limma)
# Set working directory
setwd("~/Documents/Bioinformatics/Projects/20190122_Q96K97_NoStress_RNASeq/R")
# Load counts analysed by feature counts
counts <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
dplyr::select(Geneid, starts_with("W"), starts_with("Q"))
# Create DGEList and calculate normalisaton factors
dgeList <- counts %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
DGEList() %>%
calcNormFactors()
# Set group variable
dgeList$samples$group <- colnames(dgeList) %>%
str_extract("(W|Q)") %>%
factor(levels = c("W", "Q"))
# Add AnnotationHub and subset to search for zebrafish
ah <- AnnotationHub()
ah %>%
subset(species == "Danio rerio") %>%
subset(dataprovider == "Ensembl") %>%
subset(rdataclass == "EnsDb")
## AnnotationHub with 11 records
## # snapshotDate(): 2019-05-02
## # $dataprovider: Ensembl
## # $species: Danio rerio
## # $rdataclass: EnsDb
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH53201"]]'
##
## title
## AH53201 | Ensembl 87 EnsDb for Danio Rerio
## AH53705 | Ensembl 88 EnsDb for Danio Rerio
## AH56671 | Ensembl 89 EnsDb for Danio Rerio
## AH57746 | Ensembl 90 EnsDb for Danio Rerio
## AH60762 | Ensembl 91 EnsDb for Danio Rerio
## ... ...
## AH64434 | Ensembl 93 EnsDb for Danio Rerio
## AH64906 | Ensembl 94 EnsDb for Danio rerio
## AH67932 | Ensembl 95 EnsDb for Danio rerio
## AH69169 | Ensembl 96 EnsDb for Danio rerio
## AH73861 | Ensembl 97 EnsDb for Danio rerio
# Select correct Ensembl release
ensDb <- ah[["AH64906"]]
# Extract GenomicRanges object from ensDb
genesGR <- genes(ensDb)
# Remove redundant columns from mcols
mcols(genesGR) <- mcols(genesGR)[c("gene_id", "gene_name",
"gene_biotype", "entrezid")]
transGR <- transcripts(ensDb)
mcols(transGR) <- mcols(transGR)[c("tx_id", "gene_id")]
trans2Gene <- tibble(
tx_id = transGR$tx_id,
gene_id = transGR$gene_id
)
# Add genesGR to DGEList using rownames of DGEList to reorder the genesGR
dgeList$genes <- genesGR[rownames(dgeList),]
# Perform logical test to see how many genes were not detected in dataset
dgeList$counts %>%
rowSums() %>%
is_greater_than(0) %>%
table()
## .
## FALSE TRUE
## 3927 28130
# Check for genes having > 4 samples with cpm > 1
dgeList %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(4) %>%
table()
## .
## FALSE TRUE
## 13704 18353
# Create logical vector of genes to keep that fit criteria
genes2keep <- dgeList %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeList %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "Before Filtering")
dgeFilt %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "After Filtering")
par(mfrow = c(1,1))
# Check library sizes with box plot
dgeFilt$samples %>%
ggplot(aes(group, lib.size, fill = group)) +
geom_boxplot() +
scale_y_continuous(labels = comma) +
labs(x = "Genotype", y = "Library Size") +
scale_fill_discrete(
name ="Genotype",
labels = c("Wildtype", "Mutant")
) +
scale_x_discrete(labels=c("W" = "Wildtype", "Q" = "Mutant")) +
theme_bw()
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pca <- dgeFilt %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pca)$importance %>% pander(split.tables = Inf)
| Â | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 |
|---|---|---|---|---|---|---|---|---|---|
| Standard deviation | 22.27 | 18.07 | 16.75 | 14.73 | 14.45 | 13.34 | 11.87 | 11.2 | 5.671e-14 |
| Proportion of Variance | 0.2513 | 0.1655 | 0.1421 | 0.1099 | 0.1058 | 0.09023 | 0.07145 | 0.06362 | 0 |
| Cumulative Proportion | 0.2513 | 0.4168 | 0.559 | 0.6689 | 0.7747 | 0.8649 | 0.9364 | 1 | 1 |
# Plot PCA
pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(rownames_to_column(dgeFilt$samples, "sample")) %>%
ggplot(aes(PC1, PC2, colour = group, label = sample)) +
geom_point() +
geom_text_repel(show.legend = FALSE) +
theme_bw()
# Create model matrix
design <- model.matrix(~group, data = dgeFilt$samples)
# Perform exact test on DGEList
topTable <- dgeFilt %>%
estimateDisp(design = design) %>%
exactTest() %>%
topTags(n = Inf) %>%
.$table %>%
as_tibble() %>%
unite("Range", ID.start, ID.end, sep = "-") %>%
unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
dplyr::select(
Geneid = ID.gene_id,
Symbol = ID.gene_name,
AveExpr = logCPM, logFC,
P.Value = PValue,
FDR, Location,
Entrez = ID.entrezid
) %>%
mutate(DE = FDR < 0.05)
# Volcano plot showing DE genes
topTable %>%
ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
geom_point(alpha = 0.5) +
geom_text_repel(data = . %>%
dplyr::filter(DE) %>%
dplyr::filter(-log10(P.Value) > 4 | abs(logFC) > 2.5), aes(label = Symbol)) +
scale_colour_manual(values = c("grey", "red")) +
theme_bw() +
theme(legend.position = "none")
# MD Plot showing DE genes
topTable %>%
dplyr::arrange(desc(P.Value)) %>%
ggplot(aes(AveExpr, logFC, colour = DE)) +
geom_point(alpha = 0.5) +
geom_text_repel(
data = . %>%
dplyr::filter(DE) %>%
dplyr::filter(abs(logFC) > 2 | AveExpr > 14),
aes(label = Symbol)
) +
scale_colour_manual(values = c("grey", "red")) +
labs(x = "Average Expression (log2 CPM)",
y = "log Fold-Change") +
theme_bw() +
theme(legend.position = "none")
# Summary of DE genes
topTableDE <- topTable %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR)
topTableDE %>% pander(style = "rmarkdown", split.tables = Inf)
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000091368 | AL954327.1 | 2.656 | -5.923 | 1.214e-08 | 0.0002228 |
| ENSDARG00000093214 | si:ch211-284e13.9 | 0.8189 | 1.542 | 6.086e-07 | 0.005585 |
| ENSDARG00000037421 | egr1 | 8.527 | -0.712 | 9.887e-07 | 0.006049 |
| ENSDARG00000017246 | prx | 2.326 | -2.614 | 1.823e-06 | 0.008364 |
| ENSDARG00000089477 | si:ch211-132g1.3 | 5.855 | 0.6079 | 3.135e-06 | 0.01089 |
| ENSDARG00000089382 | zgc:158463 | 5.631 | 0.6536 | 3.561e-06 | 0.01089 |
| ENSDARG00000080337 | NC_002333.4 | 11.17 | 0.4449 | 5.344e-06 | 0.01401 |
| ENSDARG00000096829 | blvrb | 3.025 | -1.521 | 1.643e-05 | 0.03386 |
| ENSDARG00000093438 | CU467110.1 | 4.596 | 0.5459 | 1.752e-05 | 0.03386 |
| ENSDARG00000091916 | ugt5b4 | -0.0384 | -1.445 | 1.994e-05 | 0.03386 |
| ENSDARG00000036304 | dnaaf3l | 1.358 | -1.44 | 2.029e-05 | 0.03386 |
ens2Entrez <- file.path("https://uofabioinformaticshub.github.io/Intro-NGS-fib",
"data", "ens2Entrez.tsv") %>%
url() %>%
read_tsv()
de <- topTable %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(Geneid) %>%
left_join(ens2Entrez) %>%
dplyr::filter(!is.na(Entrez)) %>%
.[["Entrez"]] %>%
unique()
uv <- topTable %>%
dplyr::select(Geneid) %>%
left_join(ens2Entrez) %>%
dplyr::filter(!is.na(Entrez)) %>%
.[["Entrez"]] %>%
unique()
goResults <- goana(de = de, universe = uv, species = "Hs")
goResults %>%
rownames_to_column("GO ID") %>%
as_tibble() %>%
dplyr::filter(DE > 1) %>%
dplyr::arrange(P.DE) %>%
mutate(FDR = p.adjust(P.DE, "fdr")) %>%
dplyr::filter(FDR < 0.05) %>%
mutate(`GO ID` = str_replace(`GO ID`, ":", "\\\\:")) %>%
pander(caption = "GO Terms potentially enriched in the set of differentially expressed genes")
| GO ID | Term | Ont | N | DE | P.DE | FDR |
# Load id conversion file
idConvert <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
dplyr::select(Geneid = zfID, EntrezID = Entrez) %>%
mutate(EntrezID = as.character(EntrezID))
# Create function to convert ids (Not sure how this works, Steve wrote it)
convertHsEG2Dr <- function(ids, df = idConvert){
dplyr::filter(df, EntrezID %in% ids)$Geneid
}
# Conversion of zebrafish ensembl ID to zebrafish symbol, for plotting on network analyses
idConvertSymbol <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
dplyr::select(label = zfID, symbol = zfName) %>%
na.omit() %>%
unique()
# Create named vector of gene level statistics
ranks <- topTable %>%
mutate(stat = -sign(logFC) * log10(P.Value)) %>%
dplyr::arrange(desc(stat)) %>%
with(structure(stat, names = Geneid))
# Import hallmark human gene genesets and tidy gene set names
# .gmt files downloaded from:
# http://software.broadinstitute.org/gsea/downloads.jsp
# http://data.wikipathways.org/20190610/
hallmark <- gmtPathways("../files/h.all.v6.2.entrez.gmt") %>%
mclapply(convertHsEG2Dr, mc.cores = 4) %>%
set_names(str_remove_all(names(.), "HALLMARK_"))
kegg <- gmtPathways("../files/c2.cp.kegg.v6.2.entrez.gmt") %>%
mclapply(convertHsEG2Dr, mc.cores = 4) %>%
set_names(str_remove_all(names(.), "KEGG_"))
wiki <- gmtPathways("../files/wikipathways-20190610-gmt-Homo_sapiens.gmt") %>%
mclapply(convertHsEG2Dr, mc.cores = 4) %>%
set_names(str_remove_all(names(.), "%.+"))
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaHallmark <- fgsea(hallmark, ranks, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaHallmarkTop <- fgseaHallmark %>%
dplyr::filter(padj < 0.05)
fgseaHallmarkTop %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched Hallmark pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| MYC_TARGETS_V1 | 1.236e-05 | 0.0001011 | 0.6076 | 2.418 | 220 | 0.0006178 |
| OXIDATIVE_PHOSPHORYLATION | 1.237e-05 | 0.0001011 | 0.6488 | 2.58 | 219 | 0.0006184 |
| INTERFERON_GAMMA_RESPONSE | 1.251e-05 | 0.0001011 | 0.5684 | 2.242 | 200 | 0.0006256 |
| E2F_TARGETS | 1.254e-05 | 0.0001011 | 0.464 | 1.827 | 197 | 0.0006268 |
| ALLOGRAFT_REJECTION | 1.256e-05 | 0.0001011 | 0.5472 | 2.153 | 195 | 0.0006278 |
| DNA_REPAIR | 1.304e-05 | 0.0001011 | 0.5417 | 2.069 | 150 | 0.0006519 |
| INTERFERON_ALPHA_RESPONSE | 1.415e-05 | 0.0001011 | 0.5773 | 2.034 | 83 | 0.0007075 |
| MYOGENESIS | 5.222e-05 | 0.0003264 | -0.4095 | -1.859 | 216 | 0.002611 |
| WNT_BETA_CATENIN_SIGNALING | 6.038e-05 | 0.0003354 | -0.5614 | -2.034 | 52 | 0.003019 |
| ADIPOGENESIS | 0.0001853 | 0.0008935 | 0.4063 | 1.617 | 220 | 0.009267 |
| MTORC1_SIGNALING | 0.0001966 | 0.0008935 | 0.4066 | 1.624 | 228 | 0.009829 |
| G2M_CHECKPOINT | 0.0005689 | 0.002371 | 0.3955 | 1.572 | 216 | 0.02845 |
| KRAS_SIGNALING_DN | 0.0007387 | 0.002841 | -0.3669 | -1.599 | 155 | 0.03694 |
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
hallmark[fgseaHallmarkTop$pathway], ranks, fgseaHallmark, gseaParam = 0.5
)
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for KEGG
fgseaKEGG <- fgsea(kegg, ranks, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaKEGGTop <- fgseaKEGG %>%
dplyr::filter(padj < 0.05)
fgseaKEGGTop %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched KEGG pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION | 1.207e-05 | 0.0002836 | 0.4479 | 1.812 | 260 | 0.002246 |
| CHEMOKINE_SIGNALING_PATHWAY | 1.268e-05 | 0.0002836 | 0.6442 | 2.524 | 187 | 0.002358 |
| HUNTINGTONS_DISEASE | 1.276e-05 | 0.0002836 | 0.5849 | 2.281 | 180 | 0.002373 |
| ALZHEIMERS_DISEASE | 1.293e-05 | 0.0002836 | 0.555 | 2.141 | 164 | 0.002406 |
| SPLICEOSOME | 1.333e-05 | 0.0002836 | 0.6179 | 2.32 | 131 | 0.002479 |
| OXIDATIVE_PHOSPHORYLATION | 1.337e-05 | 0.0002836 | 0.7172 | 2.685 | 128 | 0.002486 |
| PARKINSONS_DISEASE | 1.344e-05 | 0.0002836 | 0.7061 | 2.63 | 123 | 0.002501 |
| RIBOSOME | 1.426e-05 | 0.0002836 | 0.8662 | 3.032 | 79 | 0.002653 |
| NOD_LIKE_RECEPTOR_SIGNALING_PATHWAY | 1.468e-05 | 0.0002836 | 0.6316 | 2.126 | 62 | 0.00273 |
| PROTEASOME | 1.525e-05 | 0.0002836 | 0.7112 | 2.259 | 45 | 0.002836 |
| LINOLEIC_ACID_METABOLISM | 2.872e-05 | 0.0004856 | -0.6266 | -2.178 | 43 | 0.005342 |
| ECM_RECEPTOR_INTERACTION | 3.345e-05 | 0.0005089 | -0.5644 | -2.212 | 79 | 0.006222 |
| ASCORBATE_AND_ALDARATE_METABOLISM | 4.629e-05 | 0.0005089 | -0.7435 | -3.304 | 181 | 0.00861 |
| PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS | 4.688e-05 | 0.0005089 | -0.7556 | -3.364 | 184 | 0.008719 |
| STARCH_AND_SUCROSE_METABOLISM | 4.85e-05 | 0.0005089 | -0.7369 | -3.307 | 196 | 0.00902 |
| PORPHYRIN_AND_CHLOROPHYLL_METABOLISM | 4.877e-05 | 0.0005089 | -0.7057 | -3.175 | 199 | 0.009071 |
| STEROID_HORMONE_BIOSYNTHESIS | 5.1e-05 | 0.0005089 | -0.695 | -3.152 | 213 | 0.009486 |
| FOCAL_ADHESION | 5.198e-05 | 0.0005089 | -0.4533 | -2.062 | 219 | 0.009669 |
| DRUG_METABOLISM_OTHER_ENZYMES | 5.222e-05 | 0.0005089 | -0.6915 | -3.146 | 220 | 0.009714 |
| DRUG_METABOLISM_CYTOCHROME_P450 | 5.856e-05 | 0.0005089 | -0.6977 | -3.24 | 262 | 0.01089 |
| RETINOL_METABOLISM | 5.896e-05 | 0.0005089 | -0.6954 | -3.231 | 264 | 0.01097 |
| METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 | 6.019e-05 | 0.0005089 | -0.6907 | -3.216 | 270 | 0.01119 |
| PATHWAYS_IN_CANCER | 7.027e-05 | 0.0005683 | -0.3296 | -1.568 | 334 | 0.01307 |
| SYSTEMIC_LUPUS_ERYTHEMATOSUS | 7.645e-05 | 0.0005925 | 0.61 | 1.93 | 44 | 0.01422 |
| AXON_GUIDANCE | 0.0001317 | 0.0009798 | -0.3794 | -1.663 | 162 | 0.0245 |
| JAK_STAT_SIGNALING_PATHWAY | 0.0001705 | 0.00122 | -0.3899 | -1.693 | 151 | 0.03172 |
| ARRHYTHMOGENIC_RIGHT_VENTRICULAR_CARDIOMYOPATHY_ARVC | 0.0002086 | 0.001437 | -0.4541 | -1.817 | 89 | 0.03881 |
| PATHOGENIC_ESCHERICHIA_COLI_INFECTION | 0.000225 | 0.001495 | 0.5804 | 1.894 | 52 | 0.04185 |
| NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION | 0.0002659 | 0.001706 | -0.3377 | -1.544 | 228 | 0.04946 |
# Make a table plot of significant KEGG pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
kegg[fgseaKEGGTop$pathway], ranks, fgseaKEGG, gseaParam = 0.5
)
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for WikiPathways
fgseaWiki <- fgsea(wiki, ranks, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaWikiTop <- fgseaWiki %>%
dplyr::filter(padj < 0.05)
fgseaWikiTop %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched Wiki pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| Chemokine signaling pathway | 1.284e-05 | 0.0009715 | 0.5945 | 2.303 | 170 | 0.006691 |
| Nonalcoholic fatty liver disease | 1.299e-05 | 0.0009715 | 0.5834 | 2.239 | 156 | 0.006768 |
| mRNA Processing | 1.328e-05 | 0.0009715 | 0.5371 | 2.022 | 133 | 0.006919 |
| Electron Transport Chain (OXPHOS system in mitochondria) | 1.382e-05 | 0.0009715 | 0.7417 | 2.677 | 97 | 0.007198 |
| Cytoplasmic Ribosomal Proteins | 1.416e-05 | 0.0009715 | 0.8704 | 3.052 | 80 | 0.007377 |
| Parkin-Ubiquitin Proteasomal System pathway | 1.436e-05 | 0.0009715 | 0.6106 | 2.101 | 71 | 0.007483 |
| Oxidative phosphorylation | 1.466e-05 | 0.0009715 | 0.7527 | 2.517 | 60 | 0.007635 |
| Mitochondrial complex I assembly model OXPHOS system | 1.492e-05 | 0.0009715 | 0.7502 | 2.444 | 52 | 0.007772 |
| Nicotine Metabolism | 2.591e-05 | 0.001041 | -0.768 | -2.255 | 21 | 0.0135 |
| Striated Muscle Contraction Pathway | 2.788e-05 | 0.001041 | -0.6834 | -2.256 | 34 | 0.01452 |
| Irinotecan Pathway | 2.848e-05 | 0.001041 | -0.7029 | -2.396 | 39 | 0.01484 |
| Proteasome Degradation | 2.926e-05 | 0.001041 | 0.5898 | 1.979 | 61 | 0.01525 |
| Estrogen metabolism | 3.135e-05 | 0.001041 | -0.645 | -2.398 | 59 | 0.01633 |
| Pregnane X Receptor pathway | 3.404e-05 | 0.001041 | -0.6214 | -2.444 | 80 | 0.01773 |
| Constitutive Androstane Receptor Pathway | 3.417e-05 | 0.001041 | -0.6225 | -2.452 | 81 | 0.0178 |
| Tamoxifen metabolism | 3.485e-05 | 0.001041 | -0.6519 | -2.594 | 86 | 0.01816 |
| Aryl Hydrocarbon Receptor Pathway | 3.544e-05 | 0.001041 | -0.6158 | -2.472 | 91 | 0.01846 |
| Codeine and Morphine Metabolism | 3.598e-05 | 0.001041 | -0.7161 | -2.895 | 95 | 0.01875 |
| TGF-beta Signaling Pathway | 4.232e-05 | 0.00116 | -0.4216 | -1.822 | 148 | 0.02205 |
| Glucuronidation | 4.759e-05 | 0.001205 | -0.7632 | -3.401 | 187 | 0.02479 |
| Focal Adhesion | 5.09e-05 | 0.001205 | -0.4598 | -2.085 | 213 | 0.02652 |
| NRF2 pathway | 5.09e-05 | 0.001205 | -0.4504 | -2.043 | 213 | 0.02652 |
| Metapathway biotransformation Phase I and II | 7.519e-05 | 0.001703 | -0.5239 | -2.51 | 360 | 0.03918 |
| Nuclear Receptors Meta-Pathway | 8.361e-05 | 0.001815 | -0.3388 | -1.64 | 404 | 0.04356 |
# Make a table plot of significant WikiPathways pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
wiki[fgseaWikiTop$pathway], ranks, fgseaWiki, gseaParam = 0.5
)
# Create design matrix
design <- model.matrix(~group, data = dgeFilt$samples)
# Create voom object
voom <- voom(dgeFilt, design)
# Create index for fry
fryIndex <- ids2indices(hallmark, rownames(dgeFilt$counts))
# Run fry
fry(voom, index = fryIndex, design = design) %>%
rownames_to_column("pathway") %>%
dplyr::arrange(PValue.Mixed) %>%
as.data.frame() %>%
column_to_rownames("pathway")
## NGenes Direction PValue FDR
## ALLOGRAFT_REJECTION 162 Up 0.07596489 0.4567829
## INTERFERON_GAMMA_RESPONSE 174 Up 0.11167661 0.4567829
## INTERFERON_ALPHA_RESPONSE 74 Up 0.16768009 0.4567829
## APICAL_SURFACE 50 Down 0.50885615 0.6876434
## INFLAMMATORY_RESPONSE 160 Down 0.56905689 0.6939718
## TNFA_SIGNALING_VIA_NFKB 185 Up 0.37021201 0.6309383
## WNT_BETA_CATENIN_SIGNALING 48 Down 0.02324950 0.4567829
## NOTCH_SIGNALING 38 Up 0.24181096 0.4836219
## COMPLEMENT 191 Up 0.44013368 0.6381304
## IL2_STAT5_SIGNALING 198 Up 0.39742195 0.6309383
## P53_PATHWAY 211 Up 0.39066425 0.6309383
## E2F_TARGETS 197 Up 0.16026231 0.4567829
## IL6_JAK_STAT3_SIGNALING 68 Up 0.40380054 0.6309383
## FATTY_ACID_METABOLISM 177 Up 0.07310109 0.4567829
## KRAS_SIGNALING_UP 198 Down 0.80779747 0.8764850
## DNA_REPAIR 150 Up 0.07456667 0.4567829
## MYC_TARGETS_V1 220 Up 0.17316997 0.4567829
## SPERMATOGENESIS 113 Up 0.29564563 0.5474919
## HEME_METABOLISM 183 Up 0.96210162 0.9621016
## APOPTOSIS 153 Up 0.22395939 0.4812092
## ADIPOGENESIS 220 Up 0.13806564 0.4567829
## G2M_CHECKPOINT 216 Up 0.19928842 0.4744962
## OXIDATIVE_PHOSPHORYLATION 219 Up 0.17907727 0.4567829
## KRAS_SIGNALING_DN 154 Down 0.12383031 0.4567829
## MYOGENESIS 215 Down 0.16803951 0.4567829
## APICAL_JUNCTION 210 Down 0.13454047 0.4567829
## UV_RESPONSE_UP 171 Up 0.43866048 0.6381304
## UNFOLDED_PROTEIN_RESPONSE 125 Up 0.36669753 0.6309383
## COAGULATION 109 Up 0.68906753 0.8012413
## EPITHELIAL_MESENCHYMAL_TRANSITION 212 Down 0.17710119 0.4567829
## XENOBIOTIC_METABOLISM 192 Up 0.82389592 0.8764850
## GLYCOLYSIS 215 Up 0.12093542 0.4567829
## PROTEIN_SECRETION 101 Down 0.63381732 0.7545444
## MITOTIC_SPINDLE 245 Down 0.54282931 0.6939718
## PEROXISOME 119 Up 0.48829625 0.6781892
## PI3K_AKT_MTOR_SIGNALING 120 Down 0.54360742 0.6939718
## ESTROGEN_RESPONSE_LATE 195 Up 0.44669131 0.6381304
## REACTIVE_OXIGEN_SPECIES_PATHWAY 49 Up 0.23098040 0.4812092
## HYPOXIA 212 Down 0.55836872 0.6939718
## BILE_ACID_METABOLISM 106 Down 0.86005207 0.8776042
## MTORC1_SIGNALING 225 Up 0.26835620 0.5160696
## UV_RESPONSE_DN 165 Down 0.18271314 0.4567829
## ANGIOGENESIS 36 Down 0.02050403 0.4567829
## ANDROGEN_RESPONSE 103 Up 0.74186501 0.8242945
## ESTROGEN_RESPONSE_EARLY 196 Down 0.21886178 0.4812092
## MYC_TARGETS_V2 58 Up 0.71131129 0.8083083
## PANCREAS_BETA_CELLS 37 Up 0.03061771 0.4567829
## TGF_BETA_SIGNALING 58 Down 0.17484418 0.4567829
## HEDGEHOG_SIGNALING 44 Down 0.15130253 0.4567829
## CHOLESTEROL_HOMEOSTASIS 81 Down 0.84782517 0.8776042
## PValue.Mixed FDR.Mixed
## ALLOGRAFT_REJECTION 0.01848628 0.3566740
## INTERFERON_GAMMA_RESPONSE 0.03181392 0.3566740
## INTERFERON_ALPHA_RESPONSE 0.04321697 0.3566740
## APICAL_SURFACE 0.04780344 0.3566740
## INFLAMMATORY_RESPONSE 0.07494033 0.3566740
## TNFA_SIGNALING_VIA_NFKB 0.07518999 0.3566740
## WNT_BETA_CATENIN_SIGNALING 0.09854460 0.3566740
## NOTCH_SIGNALING 0.10005755 0.3566740
## COMPLEMENT 0.10096802 0.3566740
## IL2_STAT5_SIGNALING 0.11274573 0.3566740
## P53_PATHWAY 0.11676611 0.3566740
## E2F_TARGETS 0.11703767 0.3566740
## IL6_JAK_STAT3_SIGNALING 0.12870020 0.3566740
## FATTY_ACID_METABOLISM 0.13375804 0.3566740
## KRAS_SIGNALING_UP 0.13988853 0.3566740
## DNA_REPAIR 0.15004059 0.3566740
## MYC_TARGETS_V1 0.15209394 0.3566740
## SPERMATOGENESIS 0.15402406 0.3566740
## HEME_METABOLISM 0.16080727 0.3566740
## APOPTOSIS 0.16343028 0.3566740
## ADIPOGENESIS 0.17038535 0.3566740
## G2M_CHECKPOINT 0.17155957 0.3566740
## OXIDATIVE_PHOSPHORYLATION 0.17280831 0.3566740
## KRAS_SIGNALING_DN 0.17877775 0.3566740
## MYOGENESIS 0.17973694 0.3566740
## APICAL_JUNCTION 0.19076816 0.3566740
## UV_RESPONSE_UP 0.20610861 0.3566740
## UNFOLDED_PROTEIN_RESPONSE 0.20906746 0.3566740
## COAGULATION 0.21054311 0.3566740
## EPITHELIAL_MESENCHYMAL_TRANSITION 0.21934188 0.3566740
## XENOBIOTIC_METABOLISM 0.22499811 0.3566740
## GLYCOLYSIS 0.22989067 0.3566740
## PROTEIN_SECRETION 0.23540487 0.3566740
## MITOTIC_SPINDLE 0.24732601 0.3621172
## PEROXISOME 0.25760465 0.3621172
## PI3K_AKT_MTOR_SIGNALING 0.26072436 0.3621172
## ESTROGEN_RESPONSE_LATE 0.27065963 0.3657563
## REACTIVE_OXIGEN_SPECIES_PATHWAY 0.29382781 0.3789689
## HYPOXIA 0.30760921 0.3789689
## BILE_ACID_METABOLISM 0.31055062 0.3789689
## MTORC1_SIGNALING 0.31075447 0.3789689
## UV_RESPONSE_DN 0.33700085 0.4011915
## ANGIOGENESIS 0.35220166 0.4047468
## ANDROGEN_RESPONSE 0.35617719 0.4047468
## ESTROGEN_RESPONSE_EARLY 0.36628246 0.4069805
## MYC_TARGETS_V2 0.37701893 0.4098032
## PANCREAS_BETA_CELLS 0.39298923 0.4180736
## TGF_BETA_SIGNALING 0.41119435 0.4205139
## HEDGEHOG_SIGNALING 0.41210367 0.4205139
## CHOLESTEROL_HOMEOSTASIS 0.62892117 0.6289212
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigHallmark <-
fgseaHallmarkTop %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysHallmark <- names(sigHallmark) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesHallmark <- unique(unlist(sigHallmark)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesHallmark <- full_join(pathwaysHallmark, genesHallmark, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesHallmark <- ldply(sigHallmark, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesHallmark, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesHallmark, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyHallmark <-
tbl_graph(nodes = nodesHallmark, edges = edgesHallmark, directed = FALSE) %>%
activate(nodes) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(
pathways = case_when(
id <= nrow(fgseaHallmarkTop) ~ label
),
DE = case_when(
label %in% topTableDE$Geneid ~ symbol
),
size = case_when(
label %in% topTable$Geneid ~
as.integer(row_number(label %in% topTable$Geneid)),
id <= nrow(fgseaHallmarkTop) ~ as.integer(4000)
),
colour = case_when(
id <= nrow(fgseaHallmarkTop) ~ rainbow(nrow(fgseaHallmarkTop))[id],
label %in% topTableDE$Geneid ~ "black"),
hjust = case_when(
DE == "ugt5b4" ~ as.integer(0)),
vjust = case_when(DE == "ugt5b4" ~ as.integer(5))
) %>%
activate(edges) %>%
mutate(
colour = case_when(
from <= nrow(fgseaHallmarkTop) ~ rainbow(nrow(fgseaHallmarkTop))[from]
)
)
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyHallmark, layout = "fr") +
scale_fill_manual(
values = c(rainbow(nrow(fgseaHallmarkTop)), "black"),
na.value = "gray80"
) +
geom_edge_arc(
aes(color = colour),
alpha = 0.5,
show.legend = FALSE,
curvature = 0.5
) +
geom_node_point(
aes(size = size, fill = colour),
shape = 21,
stroke = 0.5,
show.legend = FALSE
) +
geom_node_label(
aes(label = pathways),
repel = TRUE,
size = 3,
alpha = 0.7,
label.padding = 0.1
) +
geom_node_text(
aes(label = DE, hjust = hjust, vjust = vjust),
repel = TRUE,
size = 3,
alpha = 0.8,
colour = "black"
) +
theme_graph() +
theme(legend.position = "none")
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigKEGG <-
fgseaKEGGTop %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysKEGG <- names(sigKEGG) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesKEGG <- unique(unlist(sigKEGG)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesKEGG <- full_join(pathwaysKEGG, genesKEGG, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesKEGG <- ldply(sigKEGG, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesKEGG, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesKEGG, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyKEGG <-
tbl_graph(nodes = nodesKEGG, edges = edgesKEGG, directed = FALSE) %>%
activate(nodes) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(
pathways = case_when(
id <= nrow(fgseaKEGGTop) ~ label
),
DE = case_when(label %in% topTableDE$Geneid ~ symbol),
size = case_when(
label %in% topTable$Geneid ~ row_number(label %in% topTable$Geneid),
id <= nrow(fgseaKEGGTop) ~ 4000L
) %>% as.integer(),
colour = case_when(
id <= nrow(fgseaKEGGTop) ~ rainbow(nrow(fgseaKEGGTop))[id],
label %in% topTableDE$Geneid ~ "black"
),
hjust = case_when(
DE == "ugt5b4" ~ -1L,
DE == "blvrb" ~ 7L
),
vjust = case_when(
DE == "ugt5b4" ~ 7L,
DE == "blvrb" ~ 0L
)
) %>%
activate(edges) %>%
mutate(
colour = case_when(
from <= nrow(fgseaKEGGTop) ~ rainbow(nrow(fgseaKEGGTop))[from]
)
)
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(26)
# Plot graph
ggraph(tidyKEGG, layout = "fr") +
scale_fill_manual(
values = c(rainbow(nrow(fgseaKEGGTop)), "black"),
na.value = "gray80"
) +
geom_edge_arc(
aes(color = colour),
alpha = 0.5,
show.legend = FALSE,
curvature = 0.5
) +
geom_node_point(
aes(size = size, fill = colour),
shape = 21,
stroke = 0.5,
show.legend = FALSE
) +
geom_node_label(
aes(label = pathways),
repel = TRUE,
size = 3,
alpha = 0.7,
label.padding = 0.1
) +
geom_node_text(
aes(label = DE, hjust = hjust, vjust = vjust),
repel = TRUE,
size = 3,
alpha = 0.8,
colour = "black"
) +
theme_graph() +
theme(legend.position = "none")
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigWiki <-
fgseaWikiTop %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysWiki <- names(sigWiki) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesWiki <- unique(unlist(sigWiki)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesWiki <- full_join(pathwaysWiki, genesWiki, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesWiki <- ldply(sigWiki, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesWiki, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesWiki, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyWiki <-
tbl_graph(nodes = nodesWiki, edges = edgesWiki, directed = FALSE) %>%
activate(nodes) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(
pathways = case_when(
id <= nrow(fgseaWikiTop) ~ label
),
DE = case_when(
label %in% topTableDE$Geneid ~ symbol
),
size = case_when(
label %in% topTable$Geneid ~
as.integer(row_number(label %in% topTable$Geneid)),
id <= nrow(fgseaWikiTop) ~ as.integer(4000)
),
colour = case_when(
id <= nrow(fgseaWikiTop) ~ rainbow(nrow(fgseaWikiTop))[id],
label %in% topTableDE$Geneid ~ "black"
),
hjust = case_when(
DE == "ugt5b4" ~ as.integer(1),
DE == "blvrb" ~ as.integer(2)
),
vjust = case_when(
DE == "ugt5b4" ~ as.integer(7),
DE == "blvrb" ~ as.integer(7)
)
) %>%
activate(edges) %>%
mutate(
colour = case_when(
from <= nrow(fgseaWikiTop) ~ rainbow(nrow(fgseaWikiTop))[from]
)
)
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(27)
# Plot graph
ggraph(tidyWiki, layout = "fr") +
scale_fill_manual(
values = c(rainbow(nrow(fgseaWikiTop)), "black"),
na.value = "gray80"
) +
geom_edge_arc(
aes(color = colour),
alpha = 0.5,
show.legend = FALSE,
curvature = 0.5
) +
geom_node_point(
aes(size = size, fill = colour),
shape = 21,
stroke = 0.5,
show.legend = FALSE
) +
geom_node_label(
aes(label = pathways),
repel = TRUE,
size = 3,
alpha = 0.7,
label.padding = 0.1
) +
geom_node_text(
aes(label = DE, hjust = hjust, vjust = vjust),
repel = TRUE,
size = 3,
alpha = 0.8,
colour = "black"
) +
theme_graph() +
theme(legend.position = "none")
# Extract subset of low expressed genes from DE analysis to act as negative controls for RUVg procedure
negControl <- topTable %>%
dplyr::arrange(desc(P.Value)) %>%
.[1:10000,] %>%
.$Geneid
# Run RUVSeq
RUVg <- RUVg(dgeFilt$counts, negControl, 1)
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse <- dgeFilt
# Replace with results
dgeFalse$counts <- RUVg$normalizedCounts
# Run PCA function
pcaRUVg <- dgeFalse %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg)$importance %>% pander(split.tables = Inf)
| Â | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 |
|---|---|---|---|---|---|---|---|---|---|
| Standard deviation | 20.11 | 18.01 | 16.48 | 15.75 | 14.34 | 13.36 | 12.55 | 11.28 | 7.413e-14 |
| Proportion of Variance | 0.2109 | 0.1691 | 0.1417 | 0.1294 | 0.1073 | 0.09307 | 0.0822 | 0.06639 | 0 |
| Cumulative Proportion | 0.2109 | 0.38 | 0.5217 | 0.6511 | 0.7583 | 0.8514 | 0.9336 | 1 | 1 |
# Plot PCA
pcaRUVg$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(rownames_to_column(dgeFalse$samples, "sample")) %>%
ggplot(aes(PC1, PC2, colour = group, label = sample)) +
geom_point() +
geom_text_repel(show.legend = FALSE) +
theme_bw()
# Create copy of DGE List
dgeRUVg <- dgeFilt
# Bind W_1 from RUVSeq analysis to $samples
dgeRUVg$samples %<>% cbind(RUVg$W)
# Create design matrix
design <- model.matrix(~group + W_1, data = dgeRUVg$samples)
# Perform DE analysis
topTableRUVg <- estimateGLMCommonDisp(dgeRUVg, design) %>%
estimateGLMTagwiseDisp(design) %>%
glmFit(design) %>%
glmLRT(coef=2) %>%
topTags(n = Inf) %>%
.$table %>%
as_tibble() %>%
unite("Range", ID.start, ID.end, sep = "-") %>%
unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
dplyr::select(
Geneid = ID.gene_id,
Symbol = ID.gene_name,
AveExpr = logCPM, logFC,
P.Value = PValue,
FDR, Location,
Entrez = ID.entrezid
) %>%
mutate(DE = FDR < 0.05)
# Summary of DE genes
topTableRUVg %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>%
pander(style = "rmarkdown", split.tables = Inf)
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000091368 | AL954327.1 | 2.655 | -5.873 | 1.759e-12 | 3.229e-08 |
| ENSDARG00000093214 | si:ch211-284e13.9 | 0.8192 | 1.567 | 4.056e-10 | 3.722e-06 |
| ENSDARG00000017246 | prx | 2.325 | -2.652 | 1.009e-09 | 6.175e-06 |
| ENSDARG00000087732 | RF00017 | 2.174 | 1.108 | 9.402e-09 | 4.314e-05 |
| ENSDARG00000058342 | grhl2a | 2.302 | 0.9018 | 3.524e-08 | 0.0001287 |
| ENSDARG00000024789 | mxc | 1.82 | 2.513 | 4.208e-08 | 0.0001287 |
| ENSDARG00000091916 | ugt5b4 | -0.03816 | -1.437 | 6.489e-08 | 0.0001668 |
| ENSDARG00000036304 | dnaaf3l | 1.358 | -1.439 | 7.269e-08 | 0.0001668 |
| ENSDARG00000069961 | il21r.1 | 2.169 | 0.7942 | 2.494e-07 | 0.0005085 |
| ENSDARG00000042577 | batf3 | 0.437 | 1.155 | 6.085e-07 | 0.00103 |
| ENSDARG00000096829 | blvrb | 3.025 | -1.521 | 6.175e-07 | 0.00103 |
| ENSDARG00000044129 | si:ch211-214p16.1 | 3.486 | 0.8206 | 1.357e-06 | 0.002076 |
| ENSDARG00000061717 | cytip | 0.24 | 1.121 | 2.899e-06 | 0.003979 |
| ENSDARG00000041923 | ccl38.6 | 1.874 | 1.499 | 3.035e-06 | 0.003979 |
| ENSDARG00000021794 | hmmr | 1.117 | 0.8409 | 4.282e-06 | 0.005094 |
| ENSDARG00000097551 | si:dkey-7i4.15 | 2.126 | 0.9011 | 4.441e-06 | 0.005094 |
| ENSDARG00000101112 | FO704777.1 | 1.483 | -0.9218 | 4.769e-06 | 0.005149 |
| ENSDARG00000002768 | pvalb2 | 2.238 | -3.284 | 5.273e-06 | 0.005376 |
| ENSDARG00000074212 | SLC5A10 | 0.5022 | 1.075 | 5.575e-06 | 0.005385 |
| ENSDARG00000071499 | cxcl32b.1 | 0.3599 | 1.442 | 6.655e-06 | 0.005978 |
| ENSDARG00000041917 | ccl38a.4 | 0.5656 | 2.237 | 6.84e-06 | 0.005978 |
| ENSDARG00000024877 | ptgr1 | 4.625 | 2.336 | 7.44e-06 | 0.006004 |
| ENSDARG00000103456 | cyp2aa11 | 1.456 | -0.8222 | 7.524e-06 | 0.006004 |
| ENSDARG00000089477 | si:ch211-132g1.3 | 5.855 | 0.6167 | 7.862e-06 | 0.006012 |
| ENSDARG00000037378 | qdprb1 | 3.965 | 1.033 | 9.868e-06 | 0.007117 |
| ENSDARG00000071437 | ptprc | 4.363 | 0.7584 | 1.027e-05 | 0.007117 |
| ENSDARG00000086998 | zgc:64002 | 1.376 | 0.7935 | 1.047e-05 | 0.007117 |
| ENSDARG00000116047 | CU459094.3 | 1.674 | 1.612 | 1.093e-05 | 0.007166 |
| ENSDARG00000089382 | zgc:158463 | 5.632 | 0.6575 | 1.285e-05 | 0.008135 |
| ENSDARG00000091548 | stard9 | 3.837 | -0.8309 | 1.432e-05 | 0.008758 |
| ENSDARG00000105263 | ccl36.1 | 2.253 | 1.51 | 1.588e-05 | 0.009399 |
| ENSDARG00000037421 | egr1 | 8.527 | -0.7054 | 1.672e-05 | 0.009591 |
| ENSDARG00000062817 | crym | 1.656 | 1.335 | 2.161e-05 | 0.01202 |
| ENSDARG00000017624 | krt4 | 3.777 | -2.333 | 2.399e-05 | 0.01263 |
| ENSDARG00000074989 | sparcl1 | 2.651 | 0.7409 | 2.436e-05 | 0.01263 |
| ENSDARG00000038668 | gbp1 | 2.923 | 1.433 | 2.478e-05 | 0.01263 |
| ENSDARG00000068457 | tnnt3b | 1.893 | -2.269 | 2.583e-05 | 0.01281 |
| ENSDARG00000102558 | pde6ha | 2.218 | 1.489 | 3.64e-05 | 0.01758 |
| ENSDARG00000111436 | CR933528.1 | 1.123 | -1.54 | 3.878e-05 | 0.01825 |
| ENSDARG00000014657 | zgc:171731 | 0.9471 | 0.8961 | 4.08e-05 | 0.01872 |
| ENSDARG00000036189 | spata4 | 1.13 | 0.9143 | 4.398e-05 | 0.01969 |
| ENSDARG00000099197 | actc1b | 3.643 | -2.355 | 6.207e-05 | 0.02712 |
| ENSDARG00000035327 | ckma | 2.35 | -2.719 | 7.182e-05 | 0.03048 |
| ENSDARG00000058774 | CABZ01053221.1 | 0.1135 | 0.973 | 7.306e-05 | 0.03048 |
| ENSDARG00000101767 | si:dkey-183i3.6 | 0.4311 | 1.2 | 7.581e-05 | 0.03092 |
| ENSDARG00000067990 | myhz1.1 | 2.622 | -2.821 | 8.088e-05 | 0.03227 |
| ENSDARG00000092136 | cenpw | 0.7333 | 0.7699 | 8.406e-05 | 0.03233 |
| ENSDARG00000086418 | si:ch211-236p5.3 | 4.007 | -0.8025 | 8.798e-05 | 0.03233 |
| ENSDARG00000077799 | egr4 | 6.761 | -0.9616 | 8.886e-05 | 0.03233 |
| ENSDARG00000093438 | CU467110.1 | 4.596 | 0.5512 | 8.893e-05 | 0.03233 |
| ENSDARG00000101495 | ugt5b2 | 0.7437 | -1.399 | 9.262e-05 | 0.03233 |
| ENSDARG00000098293 | si:dkey-27i16.2 | 4.548 | 0.5004 | 9.454e-05 | 0.03233 |
| ENSDARG00000070579 | ggact.3 | 2.671 | -6.26 | 9.48e-05 | 0.03233 |
| ENSDARG00000096541 | BX901942.1 | 0.815 | 0.9853 | 9.512e-05 | 0.03233 |
| ENSDARG00000106991 | CR388373.3 | 1.651 | -0.7474 | 9.743e-05 | 0.03251 |
| ENSDARG00000114910 | FO704772.1 | 6.139 | 0.5599 | 0.0001005 | 0.03295 |
| ENSDARG00000086223 | si:ch73-144d13.4 | 1.873 | -0.7772 | 0.0001138 | 0.03665 |
| ENSDARG00000091627 | si:dkey-271j15.3 | 0.2788 | -1.479 | 0.0001203 | 0.03734 |
| ENSDARG00000013524 | cyp2ae1 | 0.3343 | -0.9528 | 0.0001219 | 0.03734 |
| ENSDARG00000097234 | AL929435.1 | 0.4823 | -0.9944 | 0.0001221 | 0.03734 |
| ENSDARG00000077474 | pla2r1 | 2.495 | -0.6125 | 0.0001244 | 0.03743 |
| ENSDARG00000017653 | rgs13 | 1.278 | 0.8592 | 0.0001297 | 0.03841 |
| ENSDARG00000028507 | itgb4 | 4.24 | -0.4763 | 0.0001351 | 0.03919 |
| ENSDARG00000092062 | BX511123.2 | 1.631 | 0.8145 | 0.0001374 | 0.03919 |
| ENSDARG00000089004 | si:ch211-207e14.4 | 2.794 | -0.6261 | 0.0001388 | 0.03919 |
| ENSDARG00000002830 | trmt2a | 3.068 | 0.5274 | 0.0001514 | 0.04211 |
| ENSDARG00000073952 | slc4a7 | 4.765 | -0.5462 | 0.0001577 | 0.04321 |
| ENSDARG00000095048 | si:dkey-250k15.7 | 1.212 | -2.731 | 0.0001611 | 0.04349 |
| ENSDARG00000044775 | fut7 | 0.6278 | 0.915 | 0.0001664 | 0.04426 |
| ENSDARG00000061006 | cyb5d2 | 2.749 | 1.627 | 0.0001704 | 0.04462 |
| ENSDARG00000042350 | syt1b | 2.538 | -0.701 | 0.0001726 | 0.04462 |
| ENSDARG00000040565 | ckmb | 2.633 | -2.686 | 0.0001879 | 0.04789 |
| ENSDARG00000094984 | si:ch211-197g15.8 | 1.448 | 0.7479 | 0.0001936 | 0.04867 |
| ENSDARG00000041864 | capn3a | 0.1472 | -0.993 | 0.0001978 | 0.04906 |
| ENSDARG00000010729 | CABZ01073795.1 | 3.718 | 1.727 | 0.0002005 | 0.04906 |
| ENSDARG00000055752 | npas4a | 7.974 | -0.8624 | 0.0002053 | 0.0492 |
| ENSDARG00000110160 | SBSPON | 0.9213 | -0.8867 | 0.0002064 | 0.0492 |
kalCounts <- read_rds("nhiData/6month_Normoxia.rds")
dgeListNhi <- kalCounts$counts %>%
as.data.frame() %>%
rownames_to_column("tx_id") %>%
as_tibble() %>%
set_colnames(basename(colnames(.))) %>%
mutate(tx_id = str_remove_all(tx_id, "\\.[0-9]*$")) %>%
left_join(trans2Gene) %>%
group_by(gene_id) %>%
summarise_if(
.predicate = is.numeric,
.funs = sum
) %>%
as.data.frame() %>%
column_to_rownames("gene_id") %>%
round() %>%
set_colnames(str_remove(colnames(.), "[0-9]_MORGAN_")) %>%
set_colnames(str_remove(colnames(.), "PN")) %>%
set_colnames(str_remove(colnames(.), "_[RS].+")) %>%
set_colnames(str_replace(colnames(.), "6P", "6W")) %>%
DGEList(
group = colnames(.) %>%
str_replace_all("(6)([WQ])(.+)", "\\2") %>%
str_replace_all("W", "WT") %>%
str_replace_all("Q", "Mut") %>%
factor(levels = c("WT", "Mut")),
genes = genesGR[rownames(.)]
) %>%
calcNormFactors()
# Perform logical test to see how many genes were not detected in dataset
dgeListNhi$counts %>%
rowSums() %>%
is_greater_than(0) %>%
table()
## .
## FALSE TRUE
## 1282 24299
# Check for genes having >= 4 samples with cpm > 1
dgeListNhi %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(4) %>%
table()
## .
## FALSE TRUE
## 7049 18532
# Create logical vector of genes to keep that fit criteria
genes2keepNhi <- dgeListNhi %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFiltNhi <- dgeListNhi[genes2keepNhi,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeListNhi %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "Before Filtering")
dgeFiltNhi %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "After Filtering")
par(mfrow = c(1,1))
# Check library sizes with box plot
dgeFiltNhi$samples %>%
ggplot(aes(group, lib.size, fill = group)) +
geom_boxplot() +
scale_y_continuous(labels = comma) +
labs(x = "Genotype", y = "Library Size") +
scale_fill_discrete(
name ="Genotype",
labels = c("Wildtype","Mutant")
) +
scale_x_discrete(labels=c("w" = "Wildtype", "q" = "Mutant")) +
theme_bw()
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pcaNhi <- dgeFiltNhi %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaNhi)$importance %>% pander(split.tables = Inf)
| Â | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 |
|---|---|---|---|---|---|---|---|---|
| Standard deviation | 18.02 | 16.09 | 15.83 | 14.74 | 14.21 | 13.33 | 13.09 | 4.364e-14 |
| Proportion of Variance | 0.2026 | 0.1616 | 0.1565 | 0.1356 | 0.1261 | 0.1108 | 0.1069 | 0 |
| Cumulative Proportion | 0.2026 | 0.3642 | 0.5206 | 0.6562 | 0.7823 | 0.8931 | 1 | 1 |
# Plot PCA
pcaNhi$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(rownames_to_column(dgeFiltNhi$samples, "sample")) %>%
ggplot(aes(PC1, PC2, colour = group, label = sample)) +
geom_point() +
geom_text_repel(show.legend = FALSE) +
theme_bw()
# Create model matrix
design <- model.matrix(~group, data = dgeFiltNhi$samples)
# Perform exact test on DGEList
topTableNhi <- dgeFiltNhi %>%
estimateDisp(design = design) %>%
exactTest() %>%
topTags(n = Inf) %>%
.$table %>%
as_tibble() %>%
unite("Range", start, end, sep = "-") %>%
unite("Location", seqnames, Range, strand, sep = ":") %>%
dplyr::select(Geneid = gene_id,
Symbol = gene_name,
AveExpr = logCPM, logFC,
P.Value = PValue,
FDR, Location,
Entrez = entrezid) %>%
mutate(DE = FDR < 0.05)
# Volcano plot showing DE genes
topTableNhi %>%
ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
geom_point(alpha = 0.5) +
geom_text_repel(data = . %>%
dplyr::filter(DE) %>%
dplyr::filter(-log10(P.Value) > 4 | abs(logFC) > 2.5), aes(label = Symbol)) +
scale_colour_manual(values = c("grey", "red")) +
theme_bw() +
theme(legend.position = "none")
# MD Plot showing DE genes
topTableNhi %>%
dplyr::arrange(desc(P.Value)) %>%
ggplot(aes(AveExpr, logFC, colour = DE)) +
geom_point(alpha = 0.5) +
geom_text_repel(data = . %>%
dplyr::filter(DE) %>%
dplyr::filter(abs(logFC) > 2 | AveExpr > 14),
aes(label = Symbol)) +
scale_colour_manual(values = c("grey", "red")) +
labs(x = "Average Expression (log2 CPM)",
y = "log Fold-Change") +
theme_bw() +
theme(legend.position = "none")
# Summary of DE genes
topTableDENhi <- topTableNhi %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR)
topTableDENhi %>% pander(style = "rmarkdown", split.tables = Inf)
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000087290 | si:ch211-202h22.10 | 0.6292 | 5.598 | 9.788e-23 | 1.814e-18 |
| ENSDARG00000077567 | angel1 | 3.237 | -0.7363 | 7.88e-09 | 7.301e-05 |
| ENSDARG00000058270 | COLGALT1 (1 of many) | 5.742 | 0.5835 | 2.138e-07 | 0.001182 |
| ENSDARG00000102796 | si:ch1073-83b15.1 | 1.702 | 1.163 | 2.612e-07 | 0.001182 |
| ENSDARG00000101407 | tgm1l4 | 0.205 | -1.664 | 3.188e-07 | 0.001182 |
| ENSDARG00000092604 | si:ch211-15j1.4 | 4.745 | 1.068 | 2.306e-06 | 0.007022 |
| ENSDARG00000074266 | si:ch1073-314i13.4 | 6.159 | -0.437 | 3.105e-06 | 0.007022 |
| ENSDARG00000103727 | tfr2 | 0.8383 | -1.198 | 3.348e-06 | 0.007022 |
| ENSDARG00000035871 | rpl30 | 6.328 | -0.295 | 3.41e-06 | 0.007022 |
| ENSDARG00000098688 | CABZ01101739.1 | 3.57 | 0.5685 | 6.214e-06 | 0.01152 |
| ENSDARG00000045248 | h3f3d | 6.362 | -0.3339 | 1.18e-05 | 0.01988 |
| ENSDARG00000104839 | mansc1 | 5.318 | 0.4417 | 1.694e-05 | 0.02616 |
| ENSDARG00000075759 | samd4a | 6.597 | 0.3786 | 2.113e-05 | 0.02805 |
| ENSDARG00000091209 | ucp3 | 0.5011 | -1.314 | 2.119e-05 | 0.02805 |
| ENSDARG00000094408 | MFAP4 (1 of many) | 0.1877 | -1.82 | 2.519e-05 | 0.03112 |
| ENSDARG00000032005 | ccdc65 | 2.008 | 0.7997 | 3.23e-05 | 0.03741 |
| ENSDARG00000096375 | march7 | 6.999 | 0.3083 | 3.59e-05 | 0.03913 |
| ENSDARG00000024847 | col5a2b | 1.45 | -0.9456 | 4.18e-05 | 0.04112 |
| ENSDARG00000092807 | RPL41 | 6.716 | -0.2908 | 4.215e-05 | 0.04112 |
| ENSDARG00000091025 | znf1124 | 0.816 | 1.289 | 5.094e-05 | 0.0472 |
# Create named vector of gene level statistics
ranksNhi <- topTableNhi %>%
mutate(stat = -sign(logFC) * log10(P.Value)) %>%
dplyr::arrange(desc(stat)) %>%
with(structure(stat, names = Geneid))
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaHallmarkNhi <- fgsea(hallmark, ranksNhi, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaHallmarkNhiTop <- fgseaHallmarkNhi %>%
dplyr::filter(padj < 0.05)
fgseaHallmarkNhiTop %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched Hallmark pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| OXIDATIVE_PHOSPHORYLATION | 2.011e-05 | 0.001006 | -0.4286 | -1.738 | 222 | 0.001006 |
| MTORC1_SIGNALING | 0.0001007 | 0.002517 | -0.4053 | -1.648 | 229 | 0.005035 |
| MYC_TARGETS_V1 | 0.0005627 | 0.007796 | -0.3841 | -1.557 | 221 | 0.02813 |
| XENOBIOTIC_METABOLISM | 0.0006237 | 0.007796 | -0.3829 | -1.553 | 223 | 0.03118 |
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(hallmark[fgseaHallmarkNhiTop$pathway],
ranksNhi, fgseaHallmarkNhi, gseaParam = 0.5)
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaKEGGNhi <- fgsea(kegg, ranksNhi, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaKEGGNhiTop <- fgseaKEGGNhi %>%
dplyr::filter(padj < 0.05)
fgseaKEGGNhiTop %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched KEGG pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| FATTY_ACID_METABOLISM | 1.985e-05 | 0.0005352 | -0.705 | -2.447 | 73 | 0.003692 |
| TYROSINE_METABOLISM | 1.986e-05 | 0.0005352 | -0.7169 | -2.447 | 66 | 0.003694 |
| RIBOSOME | 1.988e-05 | 0.0005352 | -0.8126 | -2.87 | 81 | 0.003698 |
| GLYCOLYSIS_GLUCONEOGENESIS | 1.993e-05 | 0.0005352 | -0.6286 | -2.271 | 94 | 0.003708 |
| METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 | 2.014e-05 | 0.0005352 | -0.4192 | -1.738 | 274 | 0.003745 |
| DRUG_METABOLISM_CYTOCHROME_P450 | 2.014e-05 | 0.0005352 | -0.4515 | -1.867 | 268 | 0.003746 |
| RETINOL_METABOLISM | 2.014e-05 | 0.0005352 | -0.4822 | -1.997 | 273 | 0.003746 |
| PEROXISOME | 3.973e-05 | 0.0009238 | -0.5632 | -1.981 | 79 | 0.00739 |
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(kegg[fgseaKEGGNhiTop$pathway],
ranksNhi, fgseaKEGGNhi, gseaParam = 0.5)
# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaWikiNhi <- fgsea(wiki, ranksNhi, nperm=1e5) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaWikiNhiTop <- fgseaWikiNhi %>%
dplyr::filter(padj < 0.05)
fgseaWikiNhiTop %>%
dplyr::select(-leadingEdge, -nMoreExtreme) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched Wiki pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | pval | FDR | ES | NES | size | padj |
|---|---|---|---|---|---|---|
| Cytoplasmic Ribosomal Proteins | 1.995e-05 | 0.003503 | -0.8051 | -2.838 | 81 | 0.01045 |
| Fatty Acid Omega Oxidation | 2.003e-05 | 0.003503 | -0.8832 | -2.609 | 32 | 0.0105 |
| Ethanol effects on histone modifications | 2.006e-05 | 0.003503 | -0.6779 | -2.088 | 39 | 0.01051 |
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(wiki[fgseaWikiNhiTop$pathway],
ranksNhi, fgseaWikiNhi, gseaParam = 0.5)
# Create design matrix
design <- model.matrix(~group, data = dgeFiltNhi$samples)
# Create voom object
voomNhi <- voom(dgeFiltNhi, design)
# Create index for fry
fryIndexNhi <- ids2indices(hallmark, rownames(dgeFiltNhi$counts))
# Run fry
fry(voomNhi, index = fryIndexNhi, design = design) %>%
rownames_to_column("pathway") %>%
dplyr::arrange(PValue.Mixed) %>%
as.data.frame() %>%
column_to_rownames("pathway")
## NGenes Direction PValue FDR
## PROTEIN_SECRETION 102 Down 0.27959270 0.7698557
## MYC_TARGETS_V1 221 Down 0.23241706 0.7698557
## G2M_CHECKPOINT 213 Down 0.36205530 0.7698557
## E2F_TARGETS 191 Down 0.40032495 0.7698557
## TGF_BETA_SIGNALING 58 Up 0.32978673 0.7698557
## ADIPOGENESIS 222 Down 0.46538792 0.8023930
## APICAL_SURFACE 51 Up 0.23393729 0.7698557
## PI3K_AKT_MTOR_SIGNALING 118 Up 0.35321941 0.7698557
## DNA_REPAIR 153 Down 0.68331034 0.8565908
## APICAL_JUNCTION 223 Up 0.01266432 0.6332160
## NOTCH_SIGNALING 38 Down 0.68527268 0.8565908
## OXIDATIVE_PHOSPHORYLATION 222 Down 0.38622964 0.7698557
## SPERMATOGENESIS 121 Down 0.82759676 0.9373399
## MITOTIC_SPINDLE 245 Up 0.08688538 0.7698557
## IL6_JAK_STAT3_SIGNALING 68 Up 0.28792175 0.7698557
## ALLOGRAFT_REJECTION 167 Up 0.67358688 0.8565908
## ESTROGEN_RESPONSE_EARLY 195 Down 0.88926360 0.9460251
## UNFOLDED_PROTEIN_RESPONSE 122 Down 0.25519731 0.7698557
## HEME_METABOLISM 190 Down 0.22797117 0.7698557
## UV_RESPONSE_DN 169 Up 0.30017080 0.7698557
## KRAS_SIGNALING_UP 209 Up 0.31130379 0.7698557
## XENOBIOTIC_METABOLISM 207 Down 0.18859709 0.7698557
## BILE_ACID_METABOLISM 112 Down 0.34015629 0.7698557
## ESTROGEN_RESPONSE_LATE 197 Down 0.93105626 0.9698503
## HEDGEHOG_SIGNALING 44 Down 0.96349754 0.9831608
## P53_PATHWAY 215 Down 0.39386274 0.7698557
## FATTY_ACID_METABOLISM 185 Down 0.17003582 0.7698557
## INFLAMMATORY_RESPONSE 167 Up 0.67009640 0.8565908
## GLYCOLYSIS 214 Down 0.81643292 0.9373399
## COAGULATION 118 Up 0.10918373 0.7698557
## HYPOXIA 212 Down 0.63447176 0.8565908
## MTORC1_SIGNALING 226 Down 0.35491197 0.7698557
## KRAS_SIGNALING_DN 162 Up 0.88187853 0.9460251
## WNT_BETA_CATENIN_SIGNALING 49 Down 0.76951895 0.9160940
## INTERFERON_GAMMA_RESPONSE 173 Up 0.63091627 0.8565908
## APOPTOSIS 156 Up 0.15107937 0.7698557
## CHOLESTEROL_HOMEOSTASIS 81 Down 0.50552628 0.8425438
## IL2_STAT5_SIGNALING 209 Up 0.60123594 0.8565908
## REACTIVE_OXIGEN_SPECIES_PATHWAY 48 Up 0.63071572 0.8565908
## TNFA_SIGNALING_VIA_NFKB 189 Down 0.73737788 0.8992413
## PEROXISOME 119 Down 0.05436146 0.7698557
## ANDROGEN_RESPONSE 107 Up 0.60672070 0.8565908
## EPITHELIAL_MESENCHYMAL_TRANSITION 219 Up 0.03232789 0.7698557
## COMPLEMENT 201 Up 0.11612406 0.7698557
## ANGIOGENESIS 33 Up 0.45509598 0.8023930
## MYOGENESIS 224 Up 0.18156374 0.7698557
## INTERFERON_ALPHA_RESPONSE 78 Up 0.65803997 0.8565908
## MYC_TARGETS_V2 56 Down 0.42760157 0.7918548
## UV_RESPONSE_UP 176 Down 0.84360590 0.9373399
## PANCREAS_BETA_CELLS 37 Up 0.98848288 0.9884829
## PValue.Mixed FDR.Mixed
## PROTEIN_SECRETION 0.01357294 0.5583283
## MYC_TARGETS_V1 0.02476806 0.5583283
## G2M_CHECKPOINT 0.04885238 0.5583283
## E2F_TARGETS 0.04972041 0.5583283
## TGF_BETA_SIGNALING 0.07347099 0.5583283
## ADIPOGENESIS 0.12210669 0.5583283
## APICAL_SURFACE 0.13185471 0.5583283
## PI3K_AKT_MTOR_SIGNALING 0.13590464 0.5583283
## DNA_REPAIR 0.13717787 0.5583283
## APICAL_JUNCTION 0.15484389 0.5583283
## NOTCH_SIGNALING 0.17439845 0.5583283
## OXIDATIVE_PHOSPHORYLATION 0.18099078 0.5583283
## SPERMATOGENESIS 0.18123531 0.5583283
## MITOTIC_SPINDLE 0.20717829 0.5583283
## IL6_JAK_STAT3_SIGNALING 0.21778096 0.5583283
## ALLOGRAFT_REJECTION 0.23168133 0.5583283
## ESTROGEN_RESPONSE_EARLY 0.23312471 0.5583283
## UNFOLDED_PROTEIN_RESPONSE 0.23509505 0.5583283
## HEME_METABOLISM 0.23868863 0.5583283
## UV_RESPONSE_DN 0.23947239 0.5583283
## KRAS_SIGNALING_UP 0.25771085 0.5583283
## XENOBIOTIC_METABOLISM 0.27129365 0.5583283
## BILE_ACID_METABOLISM 0.27955407 0.5583283
## ESTROGEN_RESPONSE_LATE 0.28199324 0.5583283
## HEDGEHOG_SIGNALING 0.30171490 0.5583283
## P53_PATHWAY 0.30618682 0.5583283
## FATTY_ACID_METABOLISM 0.30897382 0.5583283
## INFLAMMATORY_RESPONSE 0.32117220 0.5583283
## GLYCOLYSIS 0.32383041 0.5583283
## COAGULATION 0.33624577 0.5604096
## HYPOXIA 0.36494618 0.5633975
## MTORC1_SIGNALING 0.36898233 0.5633975
## KRAS_SIGNALING_DN 0.37184233 0.5633975
## WNT_BETA_CATENIN_SIGNALING 0.39911208 0.5826677
## INTERFERON_GAMMA_RESPONSE 0.41347175 0.5826677
## APOPTOSIS 0.42378326 0.5826677
## CHOLESTEROL_HOMEOSTASIS 0.43117409 0.5826677
## IL2_STAT5_SIGNALING 0.44758553 0.5877477
## REACTIVE_OXIGEN_SPECIES_PATHWAY 0.45844323 0.5877477
## TNFA_SIGNALING_VIA_NFKB 0.47226405 0.5903301
## PEROXISOME 0.51387585 0.6266779
## ANDROGEN_RESPONSE 0.56673227 0.6489895
## EPITHELIAL_MESENCHYMAL_TRANSITION 0.57979775 0.6489895
## COMPLEMENT 0.58199338 0.6489895
## ANGIOGENESIS 0.58409057 0.6489895
## MYOGENESIS 0.61842950 0.6722060
## INTERFERON_ALPHA_RESPONSE 0.64360126 0.6846822
## MYC_TARGETS_V2 0.66597517 0.6937241
## UV_RESPONSE_UP 0.68112589 0.6950264
## PANCREAS_BETA_CELLS 0.85490624 0.8549062
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigHallmarkNhi <-
fgseaHallmarkNhiTop %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysHallmarkNhi <- names(sigHallmarkNhi) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesHallmarkNhi <- unique(unlist(sigHallmarkNhi)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesHallmarkNhi <- full_join(pathwaysHallmarkNhi, genesHallmarkNhi, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesHallmarkNhi <- ldply(sigHallmarkNhi, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesHallmarkNhi, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesHallmarkNhi, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyHallmarkNhi <-
tbl_graph(nodes = nodesHallmarkNhi,
edges = edgesHallmarkNhi,
directed = FALSE) %>%
activate(nodes) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(
pathways = case_when(
id <= nrow(fgseaHallmarkNhiTop) ~ label
),
DE = case_when(
label %in% topTableDENhi$Geneid ~ symbol
),
size = case_when(
label %in% topTableNhi$Geneid ~
as.integer(row_number(label %in% topTableNhi$Geneid)),
id <= nrow(fgseaHallmarkNhiTop) ~ as.integer(4000)
),
colour = case_when(
id <= nrow(fgseaHallmarkNhiTop) ~ rainbow(nrow(fgseaHallmarkNhiTop))[id],
label %in% topTableDENhi$Geneid ~ "black"
)
) %>%
activate(edges) %>%
mutate(
colour = case_when(
from <= nrow(fgseaHallmarkNhiTop) ~
rainbow(nrow(fgseaHallmarkNhiTop))[from]
)
)
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyHallmarkNhi, layout = "fr") +
scale_fill_manual(
values = c(rainbow(nrow(fgseaHallmarkNhiTop)), "black"),
na.value = "gray80"
) +
geom_edge_arc(
aes(color = colour),
alpha = 0.5,
show.legend = FALSE,
curvature = 0.5
) +
geom_node_point(
aes(size = size, fill = colour),
shape = 21, stroke = 0.5,
show.legend = FALSE
) +
geom_node_label(
aes(label = pathways),
repel = TRUE, size = 3,
alpha = 0.7,
label.padding = 0.1
) +
geom_node_text(
aes(label = DE),
repel = TRUE,
size = 3,
alpha = 0.8,
colour = "black"
) +
theme_graph() +
theme(legend.position = "none")
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigKEGGNhi <-
fgseaKEGGNhiTop %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysKEGGNhi <- names(sigKEGGNhi) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesKEGGNhi <- unique(unlist(sigKEGGNhi)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesKEGGNhi <- full_join(pathwaysKEGGNhi, genesKEGGNhi, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesKEGGNhi <- ldply(sigKEGGNhi, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesKEGGNhi, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesKEGGNhi, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyKEGGNhi <-
tbl_graph(nodes = nodesKEGGNhi, edges = edgesKEGGNhi, directed = FALSE) %>%
activate(nodes) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(
pathways = case_when(
id <= nrow(fgseaKEGGNhiTop) ~ label
),
DE = case_when(
label %in% topTableDENhi$Geneid ~ symbol
),
size = case_when(
label %in% topTableNhi$Geneid ~
as.integer(row_number(label %in% topTableNhi$Geneid)),
id <= nrow(fgseaKEGGNhiTop) ~ as.integer(4000)
),
colour = case_when(
id <= nrow(fgseaKEGGNhiTop) ~ rainbow(nrow(fgseaKEGGNhiTop))[id],
label %in% topTableDENhi$Geneid ~ "black"
)
) %>%
activate(edges) %>%
mutate(
colour = case_when(
from <= nrow(fgseaKEGGNhiTop) ~ rainbow(nrow(fgseaKEGGNhiTop))[from]
)
)
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyKEGGNhi, layout = "fr") +
scale_fill_manual(
values = c(rainbow(nrow(fgseaKEGGNhiTop)), "black"),
na.value = "gray80"
) +
geom_edge_arc(
aes(color = colour),
alpha = 0.5,
show.legend = FALSE,
curvature = 0.5
) +
geom_node_point(
aes(size = size, fill = colour),
shape = 21,
stroke = 0.5,
show.legend = FALSE
) +
geom_node_label(
aes(label = pathways),
repel = TRUE,
size = 3,
alpha = 0.7,
label.padding = 0.1
) +
geom_node_text(
aes(label = DE),
repel = TRUE,
size = 3,
alpha = 0.8,
colour = "black"
) +
theme_graph() +
theme(legend.position = "none")
# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigWikiNhi <-
fgseaWikiNhiTop %>%
split(f = .$pathway) %>%
lapply(extract2, "leadingEdge") %>%
lapply(unlist)
# Create a node list
pathwaysWikiNhi <- names(sigWikiNhi) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
genesWikiNhi <- unique(unlist(sigWikiNhi)) %>%
as.data.frame() %>%
set_colnames("label") %>%
mutate(label = as.character(label))
nodesWikiNhi <- full_join(pathwaysWikiNhi, genesWikiNhi, by = "label") %>%
rowid_to_column("id")
# Then create an edge list
edgesWikiNhi <- ldply(sigWikiNhi, data.frame) %>%
set_colnames(c("pathway", "gene")) %>%
mutate(gene = as.character(gene)) %>%
left_join(nodesWikiNhi, by = c("pathway" = "label")) %>%
dplyr::rename(from = id) %>%
left_join(nodesWikiNhi, by = c("gene" = "label")) %>%
dplyr::rename(to = id) %>%
dplyr::select(from, to)
# Create tidygraph object
tidyWikiNhi <-
tbl_graph(nodes = nodesWikiNhi, edges = edgesWikiNhi, directed = FALSE) %>%
activate(nodes) %>%
left_join(idConvertSymbol, by = "label") %>%
mutate(
pathways = case_when(
id <= nrow(fgseaWikiNhiTop) ~ label
),
DE = case_when(
label %in% topTableDENhi$Geneid ~ symbol
),
size = case_when(
label %in% topTableNhi$Geneid ~
as.integer(row_number(label %in% topTableNhi$Geneid)),
id <= nrow(fgseaWikiNhiTop) ~ as.integer(4000)
),
colour = case_when(
id <= nrow(fgseaWikiNhiTop) ~ rainbow(nrow(fgseaWikiNhiTop))[id],
label %in% topTableDENhi$Geneid ~ "black"
)
) %>%
activate(edges) %>%
mutate(
colour = case_when(
from <= nrow(fgseaWikiNhiTop) ~ rainbow(nrow(fgseaWikiNhiTop))[from]
)
)
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyWikiNhi, layout = "fr") +
scale_fill_manual(
values = c(rainbow(nrow(fgseaWikiNhiTop)), "black"),
na.value = "gray80"
) +
geom_edge_arc(
aes(color = colour),
alpha = 0.5,
show.legend = FALSE,
curvature = 0.5
) +
geom_node_point(
aes(size = size, fill = colour),
shape = 21,
stroke = 0.5,
show.legend = FALSE
) +
geom_node_label(
aes(label = pathways),
repel = TRUE,
size = 3,
alpha = 0.7,
label.padding = 0.1
) +
geom_node_text(
aes(label = DE),
repel = TRUE,
size = 3,
alpha = 0.8,
colour = "black") +
theme_graph() +
theme(legend.position = "none")
# Extract subset of low expressed genes from DE analysis to act as negative controls for RUVg procedure
negControlNhi <- topTableNhi %>%
dplyr::arrange(desc(P.Value)) %>%
.[1:10000,] %>%
.$Geneid
# Run RUVSeq
RUVgNhi <- RUVg(dgeFiltNhi$counts, negControlNhi, 1)
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalseNhi <- dgeFiltNhi
# Replace with results
dgeFalse$counts <- RUVgNhi$normalizedCounts
# Run PCA function
pcaRUVgNhi <- dgeFalseNhi %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVgNhi)$importance %>% pander(split.tables = Inf)
| Â | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 |
|---|---|---|---|---|---|---|---|---|
| Standard deviation | 18.02 | 16.09 | 15.83 | 14.74 | 14.21 | 13.33 | 13.09 | 4.364e-14 |
| Proportion of Variance | 0.2026 | 0.1616 | 0.1565 | 0.1356 | 0.1261 | 0.1108 | 0.1069 | 0 |
| Cumulative Proportion | 0.2026 | 0.3642 | 0.5206 | 0.6562 | 0.7823 | 0.8931 | 1 | 1 |
# Plot PCA
pcaRUVgNhi$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(rownames_to_column(dgeFalseNhi$samples, "sample")) %>%
ggplot(aes(PC1, PC2, colour = group, label = sample)) +
geom_point() +
geom_text_repel(show.legend = FALSE) +
theme_bw()
# Create copy of DGE List
dgeRUVgNhi <- dgeFiltNhi
# Bind W_1 from RUVSeq analysis to $samples
dgeRUVgNhi$samples %<>% cbind(RUVgNhi$W)
# Create design matrix
design <- model.matrix(~group + W_1, data = dgeRUVgNhi$samples)
# Perform DE analysis
topTableRUVgNhi <- estimateGLMCommonDisp(dgeRUVgNhi, design) %>%
estimateGLMTagwiseDisp(design) %>%
glmFit(design) %>%
glmLRT(coef=2) %>%
topTags(n = Inf) %>%
.$table %>%
as_tibble() %>%
unite("Range", start, end, sep = "-") %>%
unite("Location", seqnames, Range, strand, sep = ":") %>%
dplyr::select(
Geneid = gene_id,
Symbol = gene_name,
AveExpr = logCPM, logFC,
P.Value = PValue,
FDR, Location,
Entrez = entrezid
) %>%
mutate(DE = FDR < 0.05)
# Summary of DE genes
topTableRUVgNhi %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>%
pander(style = "rmarkdown", split.tables = Inf)
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |
|---|---|---|---|---|---|
| ENSDARG00000087290 | si:ch211-202h22.10 | 0.6301 | 4.881 | 9.265e-26 | 1.717e-21 |
| ENSDARG00000091253 | smyd1b | 0.6732 | 2.6 | 6.69e-13 | 6.199e-09 |
| ENSDARG00000102796 | si:ch1073-83b15.1 | 1.702 | 1.388 | 1.409e-09 | 8.703e-06 |
| ENSDARG00000094408 | MFAP4 (1 of many) | 0.1876 | -2.021 | 4.623e-09 | 2.142e-05 |
| ENSDARG00000103727 | tfr2 | 0.8382 | -1.383 | 6.576e-08 | 0.0002437 |
| ENSDARG00000077567 | angel1 | 3.237 | -0.7859 | 1.66e-07 | 0.0004883 |
| ENSDARG00000025320 | col21a1 | 0.126 | 1.912 | 1.844e-07 | 0.0004883 |
| ENSDARG00000075974 | si:dkey-258f14.3 | 2.847 | -1.485 | 2.153e-07 | 0.0004987 |
| ENSDARG00000101918 | si:ch211-111e20.1 | 3.081 | 5.155 | 3.972e-07 | 0.0008178 |
| ENSDARG00000101407 | tgm1l4 | 0.2047 | -1.65 | 5.591e-07 | 0.001036 |
| ENSDARG00000058270 | COLGALT1 (1 of many) | 5.742 | 0.6908 | 6.344e-07 | 0.001069 |
| ENSDARG00000061177 | mov10b.1 | 1.191 | 1.294 | 7.145e-07 | 0.001103 |
| ENSDARG00000078246 | si:ch211-114l13.4 | 2.125 | 1.157 | 7.745e-07 | 0.001104 |
| ENSDARG00000088745 | MFAP4 (1 of many) | 1.741 | 1.103 | 1.151e-06 | 0.001524 |
| ENSDARG00000104214 | CR381540.3 | 1.666 | -1.075 | 2.278e-06 | 0.002814 |
| ENSDARG00000091025 | znf1124 | 0.8163 | 1.463 | 2.515e-06 | 0.002913 |
| ENSDARG00000015337 | comta | 3.141 | -0.7437 | 3.274e-06 | 0.003447 |
| ENSDARG00000092604 | si:ch211-15j1.4 | 4.745 | 1.169 | 3.348e-06 | 0.003447 |
| ENSDARG00000091859 | si:ch73-233m11.2 | 1.447 | -1.292 | 4.1e-06 | 0.003999 |
| ENSDARG00000094297 | si:dkey-222h21.2 | 1.605 | 0.9738 | 8.059e-06 | 0.007467 |
| ENSDARG00000113348 | FO904868.1 | 2.218 | 1.226 | 9.293e-06 | 0.008201 |
| ENSDARG00000100132 | CU929402.1 | 3.621 | 0.6179 | 1.026e-05 | 0.008646 |
| ENSDARG00000041699 | piwil1 | 0.2241 | -1.306 | 1.122e-05 | 0.00904 |
| ENSDARG00000024847 | col5a2b | 1.45 | -1.015 | 1.199e-05 | 0.009261 |
| ENSDARG00000093531 | slc37a4b | 0.4143 | 2.907 | 1.363e-05 | 0.0101 |
| ENSDARG00000093484 | si:dkey-95p16.2 | 2.553 | -0.7538 | 1.531e-05 | 0.01091 |
| ENSDARG00000086374 | isg15 | 0.4439 | 1.47 | 1.791e-05 | 0.01229 |
| ENSDARG00000088740 | hmgcll1 | 2.723 | 1.068 | 2.093e-05 | 0.01385 |
| ENSDARG00000100698 | eva1bb | 2.406 | -1.265 | 2.216e-05 | 0.01416 |
| ENSDARG00000097292 | slc26a6 | 0.5613 | 1.652 | 2.386e-05 | 0.01474 |
| ENSDARG00000095717 | si:dkey-172k15.6 | 2.068 | 1.048 | 3.588e-05 | 0.02145 |
| ENSDARG00000028912 | si:dkey-10h3.2 | 1.45 | -1.079 | 4.404e-05 | 0.0255 |
| ENSDARG00000063162 | cmtm8b | 2.698 | 0.7116 | 5.519e-05 | 0.03099 |
| ENSDARG00000046091 | si:ch211-283g2.1 | 2.825 | 2.389 | 6.376e-05 | 0.03475 |
| ENSDARG00000113526 | BX548011.3 | 7.426 | -1.056 | 7.038e-05 | 0.03727 |
| ENSDARG00000073948 | prok1 | 1.758 | 3.299 | 7.623e-05 | 0.03924 |
| ENSDARG00000093650 | si:ch211-287n14.3 | 2.653 | -0.8693 | 8.107e-05 | 0.03951 |
| ENSDARG00000099075 | CABZ01032982.1 | 0.7033 | 1.081 | 8.149e-05 | 0.03951 |
| ENSDARG00000068453 | CABZ01112732.1 | 0.2205 | 2.332 | 8.314e-05 | 0.03951 |
comnGenes <- intersect(
rownames(dgeFilt), rownames(dgeFiltNhi)
)
# Combine counts into the same DGE List
# First extract counts from 2017 DGE List
countsNhi <- dgeListNhi$counts %>%
as.data.frame() %>%
rownames_to_column("Geneid") %>%
as_tibble()
# Then join with 2019 counts
dgeListComb <- full_join(counts, countsNhi) %>%
replace(is.na(.), 0) %>%
as.data.frame() %>%
dplyr::filter(Geneid %in% comnGenes) %>%
column_to_rownames("Geneid") %>%
DGEList() %>%
calcNormFactors()
# Set group variable
dgeListComb$samples$group <- colnames(dgeListComb) %>%
str_extract("(W|Q)") %>%
factor(levels = c("W", "Q"))
# Add genesGR to DGEList using rownames of DGEList to reorder the genesGR
dgeListComb$genes <- genesGR[rownames(dgeListComb),]
# Create logical vector of genes to keep that fit criteria
genes2keepComb <- dgeListComb %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(8)
# Create new DGEList of genes fitting criteria
dgeFiltComb <- dgeListComb[genes2keepComb,,
keep.lib.sizes = FALSE] %>%
calcNormFactors()
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeListComb %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "Before Filtering")
dgeFiltComb %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "After Filtering")
par(mfrow = c(1,1))
# Extract logFC from topTables
logFC <- topTable %>%
dplyr::select(Geneid, Symbol, logFCNhi = logFC)
logFCNhi <- topTableNhi %>%
dplyr::select(Geneid, Symbol, logFC)
# Join tables
logFCComb <- full_join(logFC, logFCNhi) %>%
na.omit()
# Plot logFC graph
logFCComb %>%
ggplot(aes(logFC,logFCNhi)) +
geom_point() +
geom_text_repel(
aes(label = Symbol),
data = . %>%
dplyr::filter(abs(logFC) > 2 | abs(logFCNhi) > 2)
) +
geom_abline(slope = 1, intercept = 0, col = "blue") +
geom_vline(xintercept = c(-1, 1), linetype = 2, colour = "grey50") +
geom_hline(yintercept = c(-1, 1), linetype = 2, colour = "grey50") +
labs(title = "LogFC comparison of q96k97del between 2017 and 2019 datasets") +
xlab("2019") + ylab("2017") +
xlim(-4, 6) + ylim(-4, 6) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# Extract p-values from 2017 and 2019 GSEA results and combine using Fisher's method
p2017Hallmark <- fgseaHallmarkNhi %>%
dplyr::select(pathway, pval) %>%
dplyr::mutate(pval = log10(pval)) %>%
dplyr::rename(log10p2017 = pval)
p2019Hallmark <- fgseaHallmark %>%
dplyr::select(pathway, pval) %>%
dplyr::mutate(pval = log10(pval)) %>%
dplyr::rename(log10p2019 = pval)
fgseaHallmarkMeta <- full_join(p2017Hallmark, p2019Hallmark) %>%
replace(is.na(.), 0) %>%
dplyr::mutate(
chiSquare = -2 * (log10p2017 + log10p2019),
pCombined = pchisq(chiSquare, df = 4, lower.tail = FALSE),
FDR = p.adjust(pCombined, "fdr")
) %>%
dplyr::arrange(pCombined)
# Extract p-values from 2017 and 2019 GSEA results and combine using Fisher's method
p2017KEGG <- fgseaKEGGNhi %>%
dplyr::select(pathway, pval) %>%
dplyr::mutate(pval = log10(pval)) %>%
dplyr::rename(log10p2017 = pval)
p2019KEGG <- fgseaKEGG %>%
dplyr::select(pathway, pval) %>%
dplyr::mutate(pval = log10(pval)) %>%
dplyr::rename(log10p2019 = pval)
fgseaKEGGMeta <- full_join(p2017KEGG, p2019KEGG) %>%
replace(is.na(.), 0) %>%
dplyr::mutate(
chiSquare = -2 * (log10p2017 + log10p2019),
pCombined = pchisq(chiSquare, df = 4, lower.tail = FALSE),
FDR = p.adjust(pCombined, "fdr")
) %>%
dplyr::arrange(pCombined)
# Hallmark
fgseaHallmarkMeta %>%
dplyr::filter(FDR < 0.05) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched hallmark pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | log10p2017 | log10p2019 | chiSquare | pCombined | FDR |
|---|---|---|---|---|---|
| OXIDATIVE_PHOSPHORYLATION | -4.697 | -4.908 | 19.21 | 0.0007152 | 0.03576 |
# KEGG
fgseaKEGGMeta %>%
dplyr::filter(FDR < 0.05) %>%
pander(
style = "rmarkdown",
split.tables = Inf,
justify = "lrrrrr",
caption = paste(
"The", nrow(.), "most significantly enriched KEGG pathways.",
"This corresponds to an FDR of", percent(max(.$FDR)))
)
| pathway | log10p2017 | log10p2019 | chiSquare | pCombined | FDR |
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pcaComb <- dgeFiltComb %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pca)$importance %>% pander(split.tables = Inf)
| Â | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 |
|---|---|---|---|---|---|---|---|---|---|
| Standard deviation | 22.27 | 18.07 | 16.75 | 14.73 | 14.45 | 13.34 | 11.87 | 11.2 | 5.671e-14 |
| Proportion of Variance | 0.2513 | 0.1655 | 0.1421 | 0.1099 | 0.1058 | 0.09023 | 0.07145 | 0.06362 | 0 |
| Cumulative Proportion | 0.2513 | 0.4168 | 0.559 | 0.6689 | 0.7747 | 0.8649 | 0.9364 | 1 | 1 |
# Create PCA plots
pca12 <- pcaComb$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(rownames_to_column(dgeFiltComb$samples, "sample")) %>%
mutate(
data = case_when(
str_detect(.$sample, "_") == TRUE ~ "2017",
str_detect(.$sample, "_") == FALSE ~ "2019"
)
) %>%
ggplot(aes(PC1, PC2, colour = data, label = sample, shape = group)) +
geom_point() +
geom_text_repel(show.legend = FALSE) +
theme_bw() +
theme(legend.position = c(1.2, 0.5))
pca13 <- pcaComb$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC3) %>%
left_join(rownames_to_column(dgeFiltComb$samples, "sample")) %>%
mutate(
data = case_when(
str_detect(.$sample, "_") == TRUE ~ "2017",
str_detect(.$sample, "_") == FALSE ~ "2019"
)
) %>%
ggplot(aes(PC1, PC3, colour = data, label = sample, shape = group)) +
geom_point() +
geom_text_repel(show.legend = FALSE) +
theme_bw() +
theme(legend.position = "none")
pca23 <- pcaComb$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC2, PC3) %>%
left_join(rownames_to_column(dgeFiltComb$samples, "sample")) %>%
mutate(
data = case_when(
str_detect(.$sample, "_") == TRUE ~ "2017",
str_detect(.$sample, "_") == FALSE ~ "2019"
)
) %>%
ggplot(aes(PC2, PC3, colour = data, label = sample, shape = group)) +
geom_point() +
geom_text_repel(show.legend = FALSE) +
theme_bw() +
theme(legend.position = "none")
# Set up grid for visualisation of 3 PCA's at once.
vp1 <- viewport(x = 0.25, y = 0.25, width = 0.5, height = 0.5)
vp2 <- viewport(x = 0.25, y = 0.75, width = 0.5, height = 0.5)
vp3 <- viewport(x = 0.75, y = 0.75, width = 0.5, height = 0.5)
# Plot PCA plots in grid
if (interactive()) grid::grid.newpage()
print(pca12, vp = vp1)
print(pca13, vp = vp2)
print(pca23, vp = vp3)
# Get genes that are common between the 10000 least DE genes of each dataset
negControlComb <- intersect(negControl, negControlNhi)
# Run RUVSeq
RUVgComb <- RUVg(dgeFiltComb$counts, negControlComb, 2)
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalseComb <- dgeFiltComb
# Replace with results
dgeFalseComb$counts <- RUVgComb$normalizedCounts
# Run PCA function
pcaRUVgComb <- dgeFalseComb %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVgComb)$importance %>% pander(split.tables = Inf)
| Â | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | PC12 | PC13 | PC14 | PC15 | PC16 | PC17 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Standard deviation | 53.36 | 12.59 | 11.27 | 9.974 | 9.772 | 9.306 | 8.992 | 8.371 | 8.156 | 7.864 | 7.533 | 7.461 | 7.114 | 6.934 | 6.598 | 0.2287 | 5.267e-14 |
| Proportion of Variance | 0.7211 | 0.04016 | 0.0322 | 0.02519 | 0.02418 | 0.02193 | 0.02048 | 0.01775 | 0.01685 | 0.01566 | 0.01437 | 0.0141 | 0.01282 | 0.01218 | 0.01102 | 1e-05 | 0 |
| Cumulative Proportion | 0.7211 | 0.7612 | 0.7934 | 0.8186 | 0.8428 | 0.8648 | 0.8852 | 0.903 | 0.9198 | 0.9355 | 0.9499 | 0.964 | 0.9768 | 0.989 | 1 | 1 | 1 |
# Plot PCA
pcaRUVgComb$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, PC1, PC2) %>%
left_join(rownames_to_column(dgeFalseComb$samples, "sample")) %>%
mutate(
data = case_when(
str_detect(.$sample, "_") == TRUE ~ "2017",
str_detect(.$sample, "_") == FALSE ~ "2019"
)
) %>%
ggplot(aes(PC1, PC2, colour = data, label = sample, shape = group)) +
geom_point() +
geom_text_repel(show.legend = FALSE) +
labs(
x = paste0("PC1 (", percent(summary(pcaRUVgComb)$importance[2,"PC1"]), ")"),
y = paste0("PC2 (", percent(summary(pcaRUVgComb)$importance[2,"PC2"]), ")")
) +
theme_bw()
# Create copy of DGE List
dgeRUVgComb <- dgeFiltComb
# Bind W_1 from RUVSeq analysis to $samples
dgeRUVgComb$samples %<>% cbind(RUVgComb$W)
# Create design matrix
design <- model.matrix(~group + W_1, data = dgeRUVgComb$samples)
# Perform DE analysis
topTableRUVgComb <- estimateGLMCommonDisp(dgeRUVgComb, design) %>%
estimateGLMTagwiseDisp(design) %>%
glmFit(design) %>%
glmLRT(coef=2) %>%
topTags(n = Inf) %>%
.$table %>%
as_tibble() %>%
unite("Range", ID.start, ID.end, sep = "-") %>%
unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
dplyr::select(
Geneid = ID.gene_id,
Symbol = ID.gene_name,
AveExpr = logCPM, logFC,
P.Value = PValue,
FDR, Location,
Entrez = ID.entrezid
) %>%
mutate(DE = FDR < 0.05)
# Summary of DE genes
topTableRUVgComb %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>%
pander(style = "rmarkdown", split.tables = Inf)
| Geneid | Symbol | AveExpr | logFC | P.Value | FDR |